* Start a fresh log for this cleaning step (close any log that is still open)
cap log close
log using "$log/03_clean_data.log", replace

* Part 1: set aside person-years that have exactly one record with a non-missing
* establishment id. They are saved untouched and appended back at the end of
* this script; the spell cleaning below only runs on the remaining person-years.
* Forward slashes are used in the path: Stata accepts them on every platform,
* whereas a backslash is Windows-only and easy to break next to a macro.
use "$data/full_with_E_new.dta", clear

count

rename wage tentgelt

* c = number of records with non-missing betnr within person-year
bysort persnr year: egen c = count(betnr)
su c
keep if c == 1
drop c

* NOTE: the file name spelling ("singeltons") is kept on purpose, because the
* same name is appended and erased again at the end of this script.
save "$temp/singeltons", replace

********************************************************************************
********************************************************************************

* Part 2: reload the full data and keep only person-years that do NOT have
* exactly one record with a non-missing establishment id (the complement of
* the file saved above). These are the observations that get cleaned below.
* Forward slash in the path for portability (works on Windows as well).
use "$data/full_with_E_new.dta", clear

* c = number of records with non-missing betnr within person-year
bysort persnr year: egen c = count(betnr)
su c
drop if c == 1
drop c

rename wage tentgelt

*** Clean data (delete parallel spells, keep main employment spell)
********************************************************************************
** Clean data within establishment / year

* 1) Drop exact duplicates (same person, establishment, wage, spell dates and pt)
duplicates drop persnr betnr tentgelt begorig endorig pt, force

* 2) Delete (short) enclosed spells; for spells sharing the same start or the
*    same end but differing in length => keep the longer spell
* 2.1) fast pass: only applies where the longest spell within
*      person / year / establishment is unique
bysort persnr year betnr: egen max_spell = max(spell_length)
gen tag = (spell_length == max_spell)
bysort persnr year betnr: egen count_tag = total(tag)
* groups with several equally long longest spells are left to the loop in 2.2
replace tag = 0 if count_tag > 1
* copy start and end date of the unique longest spell to every spell of the group
* (beg1 / end1 stay missing for groups without a unique longest spell)
gen beg = begorig if tag == 1
gen end = endorig if tag == 1
bysort persnr year betnr: egen beg1 = max(beg)
bysort persnr year betnr: egen end1 = max(end)
* Drop spells that lie inside the longest spell and are shorter than it.
* Both enclosure conditions are wrapped in one pair of parentheses so that the
* missing-value guard applies to both of them (& binds tighter than |).
drop if ((begorig >= beg1 & endorig <  end1) | ///
         (begorig >  beg1 & endorig <= end1)) & ///
        !missing(beg1) & !missing(end1)
drop max_spell tag count_tag beg end beg1 end1

* 2.2) iterative pass: slower, but also handles several equally long longest
*      spells. Each round compares every spell with its direct predecessor and
*      removes it if it is enclosed by (and shorter than) that predecessor;
*      rounds are repeated until a round removes nothing.
sort persnr betnr begorig endorig
local same_job   persnr == persnr[_n-1] & betnr == betnr[_n-1]
local inside_end begorig >= begorig[_n-1] & endorig <  endorig[_n-1]
local inside_beg begorig >  begorig[_n-1] & endorig <= endorig[_n-1]
local removed = 1
while `removed' > 0 {
    local n_before = _N
    drop if `same_job' & ((`inside_end') | (`inside_beg'))
    * number of observations removed in this round
    local removed = `n_before' - _N
}

* 3) Resolve overlapping spells
*    A spell that starts inside its predecessor (same person and establishment)
*    but ends after it gets its start moved to the day after the predecessor
*    ends. The data are re-sorted and re-checked until no spell is flagged.
local same_job persnr == persnr[_n-1] & betnr == betnr[_n-1]
local overlap  begorig >= begorig[_n-1] & begorig <= endorig[_n-1] & endorig > endorig[_n-1]
local n_values = 2
while `n_values' > 1 {
    sort persnr betnr begorig endorig
    gen tag = (`same_job' & ///
        `overlap')
    tab tag, mis
    * r(r) = number of distinct values of tag; 1 means no flagged spell is left
    local n_values = r(r)
    replace begorig = (endorig[_n-1] + 1) if tag == 1
    drop tag
}
* spell dates may have changed => rebuild the spell length in days
drop spell_length
gen spell_length = (endorig - begorig + 1)

* 4) Deal with parallel spells (identical person, establishment, start and end)
* 4a) pass 1: different wage, given ft/pt => keep spell with higher wage
* 4b) pass 2: different wage, regardless of pt => keep spell with higher wage
forvalues pass = 1/2 {
    local keys persnr betnr begorig endorig
    if `pass' == 1 local keys `keys' pt
    duplicates tag `keys', g(tag)
    tab tag, mis
    bysort `keys': egen double maxwage = max(tentgelt) if tag > 0
    drop if tag > 0 & tentgelt < maxwage
    drop maxwage tag
}

* 4c) same wage: keep ft rather than pt
local keys persnr betnr begorig endorig tentgelt
duplicates tag `keys', g(tag)
tab tag, mis
* m_pt < 1 within a group of parallel spells means that not all of them have pt == 1
bysort `keys': egen m_pt = mean(pt) if tag > 0
* parallel ft and pt spells, drop pt spell(s)
drop if tag > 0 & pt == 1 & m_pt < 1
drop m_pt tag

* 4d) drop remaining duplicates within establishments
duplicates drop persnr betnr tentgelt begorig endorig, force

* 5) combine consecutive spells within ft and pt
* Everything below relies on this sort order: [_n-1] and [_n+1] refer to the
* neighbouring spell in this ordering.
sort persnr betnr begorig endorig pt
* tag = 1 if the spell starts the day after the previous spell of the same
* person and establishment ends, starts in the same year as that spell
* (year() of begorig) and has the same pt value
gen tag = (persnr == persnr[_n-1] & betnr == betnr[_n-1] & ///
year(begorig) == year(begorig[_n-1]) & begorig - 1 == endorig[_n-1] & ///
pt == pt[_n-1])
* tag2 = position of the spell within its chain of consecutive spells
* (0 = not part of a chain): first set every chain member to 1, then add up
* along the chain. replace processes the observations in order, so
* tag2[_n-1] already holds the updated position of the preceding spell.
gen tag2 = (tag[_n+1] == 1)
replace tag2 = 1 if tag == 1
replace tag2 = tag2 + tag2[_n-1] if tag2 != 0 & tag != 0 
su tag2
* number of merge passes = largest chain position minus one
local max = (r(max) - 1)

* Pass i merges chain member i into member i+1: member i+1 takes over the start
* date of member i and the duration-weighted mean of both wages, then member i
* is dropped.
forv i = 1/`max' {
* duration is rebuilt in every pass so that it reflects start dates changed in
* earlier passes; it is computed before begorig is changed in this pass
gen duration = endorig - begorig + 1  
replace begorig = begorig[_n-1] if tag == 1 & tag2 == (`i' + 1)
replace tentgelt = (tentgelt * duration + tentgelt[_n-1] * duration[_n-1])/(duration + duration[_n-1]) if tag == 1 & tag2 == (`i' + 1) 
* drop the earlier spell that has just been merged into its successor
drop if tag2 == `i' & tag2[_n+1] == (`i' + 1)
drop duration
}
* round the (possibly averaged) wage to the nearest multiple of .01
replace tentgelt = round(tentgelt,.01)
* start dates may have changed => rebuild the spell length in days
drop spell_length
gen spell_length = (endorig - begorig + 1)

drop tag tag2

********************************************************************************
** Assert that there are no parallel or overlapping spells within establishments

* Parallel spells within establishments: person, establishment, start and end
* must identify each spell uniquely
duplicates tag persnr betnr begorig endorig, g(tag)
assert tag == 0
drop tag

* Overlapping spells within establishments
* The check compares each spell with its predecessor via [_n-1], so the data
* are sorted explicitly here instead of relying on a sort done earlier on
* variables that have been modified since. The sort keys are unique at this
* point (asserted above), so the resulting order is deterministic.
sort persnr betnr begorig endorig
gen tag = (persnr == persnr[_n-1] & betnr == betnr[_n-1] & ///
    begorig <= endorig[_n-1] & ///
    (begorig != begorig[_n-1] | endorig != endorig[_n-1]))
assert tag == 0
drop tag

* Add back the person-years that were set aside at the top of this script
* and remove the intermediate file
append using "$temp/singeltons"
erase "$temp/singeltons.dta"

* Forward slash in the path for portability (works on Windows as well) and for
* consistency with the log and temp paths used in this script
save "$data/full_with_E_new_clean.dta", replace

clear
cap log close
